7.) Simulation by R

a.)

x <- seq(1, 100)
x
##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
##  [19]  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
##  [37]  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
##  [55]  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
##  [73]  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
##  [91]  91  92  93  94  95  96  97  98  99 100

b.)

w <- 2 + 0.5*x
w
##   [1]  2.5  3.0  3.5  4.0  4.5  5.0  5.5  6.0  6.5  7.0  7.5  8.0  8.5  9.0  9.5
##  [16] 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5 15.0 15.5 16.0 16.5 17.0
##  [31] 17.5 18.0 18.5 19.0 19.5 20.0 20.5 21.0 21.5 22.0 22.5 23.0 23.5 24.0 24.5
##  [46] 25.0 25.5 26.0 26.5 27.0 27.5 28.0 28.5 29.0 29.5 30.0 30.5 31.0 31.5 32.0
##  [61] 32.5 33.0 33.5 34.0 34.5 35.0 35.5 36.0 36.5 37.0 37.5 38.0 38.5 39.0 39.5
##  [76] 40.0 40.5 41.0 41.5 42.0 42.5 43.0 43.5 44.0 44.5 45.0 45.5 46.0 46.5 47.0
##  [91] 47.5 48.0 48.5 49.0 49.5 50.0 50.5 51.0 51.5 52.0

c.)

set.seed(123)
myNormDist <- rnorm(100, sd = 5)
cat("The mean is ", mean(myNormDist), "\n")
## The mean is  0.4520295
cat("The standard deviation is ", sd(myNormDist), "\n")
## The standard deviation is  4.564079
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
Norm.df <- data.frame(myNormDist)

ggplotly(ggplot(data = Norm.df, aes(x = myNormDist)) + geom_histogram(bins = 15, fill = "#8a5789"))

We notice that even though we create a normal distribution with defined mean and standard deviation, we observe a differing sample mean and standard deviation. Observing our histogram, we also can see that our distribution is not perfectly symmetric.

d.)

y <- myNormDist + w
y
##   [1] -0.3023782  1.8491126 11.2935416  4.3525420  5.1464387 13.5753249
##   [7]  7.8045810 -0.3253062  3.0657357  4.7716901 13.6204090  9.7990691
##  [13] 10.5038573  9.5534136  6.7207943 18.9345657 12.9892524  1.1669142
##  [19] 15.0067795  9.6360430  7.1608815 11.9101254  8.3699778 10.3555439
##  [25] 11.3748037  6.5665334 19.6889352 16.7668656 10.8093153 23.2690746
##  [31] 19.6323211 16.5246426 22.9756283 23.3906674 23.6079054 23.4432013
##  [37] 23.2695883 20.6904414 19.9701867 20.0976450 19.0264651 21.9604136
##  [43] 17.1730182 34.8447798 30.5398100 19.3844571 23.4855758 23.6667232
##  [49] 30.3998256 26.5831547 28.7665926 27.8572662 28.2856477 35.8430114
##  [55] 28.3711451 37.5823530 22.7562360 33.9230687 32.1192712 33.0797078
##  [61] 34.3981974 30.4883827 31.8339631 28.9071231 29.1410439 36.5176432
##  [67] 37.7410489 36.2650211 41.1113373 47.2504234 35.0448442 26.4541556
##  [73] 43.5286926 35.4539962 36.0599569 45.1278568 39.0761350 34.8964114
##  [79] 42.4065174 41.3055432 42.5288209 44.9264020 41.6466998 47.2218827
##  [85] 43.3975672 46.6589098 50.9841951 48.1759075 44.8703421 52.7440381
##  [91] 52.4675193 50.7419848 49.6936587 45.8604696 56.3032622 46.9987021
##  [97] 61.4366650 58.6630531 50.3214982 46.8678955

e.)

Norm.df["y"] <- y
names(Norm.df) <- c("x", "y")
head(Norm.df)
##            x          y
## 1 -2.8023782 -0.3023782
## 2 -1.1508874  1.8491126
## 3  7.7935416 11.2935416
## 4  0.3525420  4.3525420
## 5  0.6464387  5.1464387
## 6  8.5753249 13.5753249
# Base size for labels is 11
# https://ggplot2.tidyverse.org/reference/ggtheme.html
ggplotly(ggplot(data = Norm.df, aes(x = x, y = y)) + 
           geom_point(color = "#7eb6ff") + 
           ggtitle("Y Versus X") + 
           theme_grey(base_size = 11*1.5))

f.)

Beta1 <-
  sum((Norm.df[, 1] - mean(Norm.df[, 1])) * (Norm.df[, 2] - mean(Norm.df[, 2]))) / sum((Norm.df[, 1] - mean(Norm.df[, 1])) ^2)
Beta0 <- mean(Norm.df[,2]) - Beta1 * mean(Norm.df[, 1])

# Create column for predictions
Norm.df["Ypred"] <- Beta0 + Beta1 * Norm.df["x"]


ggplotly(ggplot(data = Norm.df, aes(x = x, y = y)) + 
           geom_point(color = "#7eb6ff") + 
            geom_line(data = Norm.df, aes(x = x, y = Ypred)) + 
           ggtitle("Y Versus X") + 
           theme_grey(base_size = 11*1.5))
Norm.df["ei"] <- Norm.df["y"] - Norm.df["Ypred"]

ggplotly(ggplot(data = Norm.df, aes(x = x, y = ei)) + 
           geom_point(color = "#75a876") + 
           ggtitle("Residuals Versus X") + 
           theme_grey(base_size = 11*1.5))
cat("The MSE is", sum(Norm.df["ei"]^2) / (nrow(Norm.df) - 2))
## The MSE is 211.2099

h.)

#Empty lists for plots
plotList <- list()
plotList2 <- list()

for (i in seq(1,5)) {
  set.seed(i)
  xLoop <- rnorm(100, sd = 5)
  yLoop <- myNormDist + w
  
  Beta1Loop <-
  sum((xLoop - mean(xLoop)) * (yLoop - mean(yLoop))) / sum((xLoop - mean(xLoop))^2)
  Beta0Loop <- mean(yLoop) - Beta1 * mean(xLoop)
  

  Loop.df <- data.frame(xLoop, yLoop, Beta0Loop + Beta1Loop * xLoop)
  names(Loop.df) <- c("x", "y", "Ypred")
  
  #Creating residuals column
  Loop.df["ei"] <- Loop.df["y"] - Loop.df["Ypred"]
  
  #Plots for y versus x 
  p <- ggplotly(ggplot(data = Loop.df, aes(x = x, y = y)) + 
           geom_point(color = "#7eb6ff") +
            geom_line(data = Loop.df, aes(x = x, y = Ypred)) + 
           ggtitle("Y Versus X") + 
           theme_grey(base_size = 11*1.5))
  plotList[[i]] <- p
  #figList <- c(figList, Loop.df)
  
  #Plots for residuals versus X
  p2 <- ggplotly(ggplot(data = Loop.df, aes(x = x, y = ei)) + 
           geom_point(color = "#75a876") + 
           ggtitle("Residuals Versus X") + 
           theme_grey(base_size = 11*1.5))
  plotList2[[i]] <- p2
}


for (i in 1:5) {
  print(plotList[[i]])
  print(plotList2[[i]])
}

From the plots we can notice that the slop of the line changes between positive and negative while also being very close to zero. There is no repetitive pattern of the same observed line with each simulation. i.) (Optional Problem)

dfOpt <- data.frame(matrix(ncol = 3, nrow = 0))
colnames(dfOpt) <- c("Beta0", "Beta1", "MSE")

for (i in 1:1000) {
  set.seed(i + 100)
  xOpt <- rnorm(100, sd = 5)
  yOpt <- myNormDist + w
  B1opt <- sum((xOpt - mean(xOpt)) * (yOpt - mean(yOpt))) / sum((yOpt - mean(yOpt))^2)
  B0opt <-mean(yOpt) - B1opt * mean(xOpt)
  YpredOpt <- B0opt + B1opt * xOpt
  MSEopt <- sum((yOpt - YpredOpt)^2) / length(xOpt) - 2
  
  #dfOpt <- rbind(dfOpt, c(B0opt, B1opt, MSEopt))
  dfOpt[i,] <- c(B0opt, B1opt, MSEopt)
  
}

head(dfOpt)
##      Beta0        Beta1      MSE
## 1 27.70726  0.028109893 237.0356
## 2 27.74315 -0.075121590 234.8550
## 3 27.70249 -0.009752073 237.3534
## 4 27.68854  0.040352463 236.6502
## 5 27.70355  0.021481534 237.1844
## 6 27.70091 -0.006764465 237.3759
ggplotly(ggplot(data = dfOpt, aes(x = Beta0)) + geom_histogram(fill = "#4b0082"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cat("The mean of Beta0 is", mean(dfOpt[,1]), "and the variance is", var(dfOpt[,1]), ".")
## The mean of Beta0 is 27.70186 and the variance is 0.0002372986 .
ggplotly(ggplot(data = dfOpt, aes(x = Beta1)) + geom_histogram(fill = "#00824b"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cat("The mean of Beta1 is", mean(dfOpt[,2]), "and the variance is", var(dfOpt[,2]), ".")
## The mean of Beta1 is -5.663902e-05 and the variance is 0.001043213 .
ggplotly(ggplot(data = dfOpt, aes(x = MSE)) + geom_histogram(fill = "#824b00"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cat("The mean of MSE is", mean(dfOpt[,3]), "and the variance is", var(dfOpt[,3]), ".")
## The mean of MSE is 236.9243 and the variance is 0.4521359 .

We can see from the the simulation of a thousand samples that our distributions for both \(\beta_0\) and \(\beta_1\) are approximately normal, while the distribution of our MSE is strongly left skewed. The means of all estimators approach the true value (which we define by the simulation). Between the two least squares coefficients, \(\beta_0\) has less spread than \(\beta_1\).